/**********************************************************************/
/*
   Author: Robbie Dulin, Michelle Han
   Created: 5 May 2021
   Last Updated: Apr 2024
   Description: Cleans raw Feb 2019 SAKERNAS data.

   This cleaning file should output 2 different datasets:
   1. Cleaned SAKERNAS data:
   sak_feb19_deid_clean
   2. Subset of SAKERNAS person-batch data matched with PMO for SAKERNAS analysis:
  sak_feb19_deid_clean_merged
*/
/**********************************************************************/
  * Load the project file paths unless a master do-file has already set them
  if "$master_run" != "1" include "./Do/SET_FILEPATHS.do"

  * Start a fresh text log whose name is prefixed with today's date (YYYYMMDD);
  * any log that is still open is closed first
  capture log close
  global prefix: display %tdCYND td(`c(current_date)')
  log using "$KP_logs/${prefix}_clean_SAKERNAS_feb2019.txt", text replace

  set more off
  clear

* Load the raw data and drop observations with a missing tahun
* (the authors' note describes these as observations that are missing everything).
* The observation count is printed before and after the drop.
  use "$KP_deid_sakernas/Raw/SAKERNAS_PRAKERJA_19FEB_deid.dta", clear
  display _N
  count if tahun == .
  drop if tahun == .
  display _N

* Give the weight variable a shorter name
  rename weightr_sp weight


/*----------------------------------------------------*/
              /* Section: Demographics */
/*----------------------------------------------------*/

* NOTE: no HH id in this survey

* household size (number of HH members)
  generate hh_size = b2_r1

* birth year
  generate year_dob = b4_k7_th

* age, plus a banded version:
* 1 = 0-20, 2 = 21-40, 3 = 41-60, 4 = 61-80, 5 = 81-100
* (recode copies any value not covered by a rule unchanged)
  summarize b4_k8
  generate age = b4_k8
  recode age (0/20 = 1) (21/40 = 2) (41/60 = 3) (61/80 = 4) (81/100 = 5), generate(age_cat)

* indicator for age 30 or under
* (a missing age compares as larger than 30, so it is coded 0)
  generate young = (age <= 30)
  label define young 0 "Over 30 Years Old" 1 "30 and Under"
  label values young young

* relationship to the household head
  tabulate b4_k3, missing
  generate relation_hh_head = b4_k3
  label define relations ///
    1 "HH head" 2 "Spouse" 3 "Son/Daughter" 4 "Step/adopted child" ///
    5 "Son/Daughter-in-law" 6 "Grandchild" 7 "Parent/Parent-in-law" ///
    8 "Other family" 9 "Housemaid" 10 "Driver/Gardener" 11 "No relation", replace
  label values relation_hh_head relations
  tabulate relation_hh_head

* marital status indicators (b4_k10: 1 = single, 2 = married, 3 = divorced, 4 = widowed)
  tabulate b4_k10, missing
  generate married  = (b4_k10 == 2)
  generate divorced = (b4_k10 == 3)
  generate widowed  = (b4_k10 == 4)
  generate single   = (b4_k10 == 1)

* gender (b4_k6: 1 = male, 2 = female)
  tabulate b4_k6, missing
  generate female = (b4_k6 == 2)

* gender, labelled version (1 = male, 0 = female)
* NOTE(review): a missing b4_k6 would be coded 0 in both female and sex - confirm none exist
  generate sex = (b4_k6 == 1)
  label define sex 1 "male"  0 "female"
  label values sex sex

* currently a student (b4_k9 == 2)
  tabulate b4_k9, missing
  generate current_student = (b4_k9 == 2)

* education levels: mutually exclusive indicators built from the b5_r1a code
* (a missing b5_r1a yields 0 in every indicator)
  tab b5_r1a, m
  gen no_elementary = b5_r1a == 1
  gen elementary = inrange(b5_r1a, 2, 4)
  gen junior_high = inrange(b5_r1a, 5, 7)
  gen high_school = inrange(b5_r1a, 8, 11)
  gen tertiary = inrange(b5_r1a, 12, 16)
  summ no_elementary elementary junior_high high_school tertiary

* education -> approximate years of schooling
* (codes outside 1-16 and missing values leave school_years missing)
  gen educ = b5_r1a
  gen school_years = 3 if educ == 1 // No elementary
  replace school_years = 6 if inlist(educ, 2, 3, 4) // elementary (2-4)
  replace school_years = 9 if inlist(educ, 5, 6, 7) // junior high (5-7)
  replace school_years = 12 if inlist(educ, 8, 9, 10, 11) // high (8-11)
  replace school_years = 13.5 if educ == 12 // Diploma I/II
  replace school_years = 15 if educ == 13 // Diploma III
  replace school_years = 16 if educ == 14 // Diploma IV / Bachelor
  replace school_years = 18 if educ == 15 // Master
  replace school_years = 21 if educ == 16 // Doctor

* Dummy for educated: 1 when school_years is strictly greater than 12, i.e.
* any schooling beyond high school. Respondents with exactly 12 years are
* coded 0, so the value labels below say "12 or Fewer" / "More than 12" to
* describe the variable as it is actually built.
* Stata treats missing as larger than any number, so a missing school_years
* is also coded 1.
* NOTE(review): the original note described this dummy as "graduating high
* school". If high-school graduates are meant to count as educated, the
* condition should be school_years >= 12 with an if !mi(school_years) guard;
* that changes the data, so the expected data signatures used before saving
* would have to be regenerated.
  gen educated = school_years > 12
  la def educated 0 "12 or Fewer Years of Schooling" 1 "More than 12 Years of Schooling"
  la val educated educated
  
* training courses
  tabulate b5_r1d, missing
  generate train_certif = (b5_r1d == 1)

  tabulate b5_r1f, missing
  generate current_training = (b5_r1f == 1)

* migration (don't know current kabupaten, so can only use province to compare)
* NOTE(review): a missing b5_r3a differs from any province code, so it is coded
* as migrated = 1 - confirm this is intended
  tabulate b5_r3a, missing
  generate migrated = (kode_prov != b5_r3a)
  tabulate migrated, missing

* disabilities: for each item keep the raw answer (*_disability) and an
* indicator (*_disabled). The codes flagged as disabled alternate between
* 2/3 and 5/6 from one item to the next.
  tabulate b5_r4a, missing
  generate vision_disabled = inlist(b5_r4a, 2, 3)
  generate vision_disability = b5_r4a

  tabulate b5_r4b, missing
  generate hearing_disabled = inlist(b5_r4b, 5, 6)
  generate hearing_disability = b5_r4b

  tabulate b5_r4c, missing
  generate walk_disabled = inlist(b5_r4c, 2, 3)
  generate walk_disability = b5_r4c

  tabulate b5_r4d, missing
  generate hand_disabled = inlist(b5_r4d, 5, 6)
  generate hand_disability = b5_r4d

  tabulate b5_r4e, missing
  generate speech_disabled = inlist(b5_r4e, 2, 3)
  generate speech_disability = b5_r4e

  tabulate b5_r4f, missing
  generate other_disabled = inlist(b5_r4f, 5, 6)
  generate other_disability = b5_r4f

  summarize *disabled

* at least one of the six indicators is set (each indicator is 0/1, never missing)
  generate any_disability = (vision_disabled | hearing_disabled | walk_disabled | hand_disabled | speech_disabled | other_disabled)
  tabulate any_disability, missing

* severe: the higher of the two flagged codes (3 or 6) on any item
  generate severe_disability = (b5_r4a == 3 | b5_r4b == 6 | b5_r4c == 3 | b5_r4d == 6 | b5_r4e == 3 | b5_r4f == 6)
  tabulate severe_disability

* urban-rural (klasifikas == 1 is coded urban)
  tabulate klasifikas, missing
  generate urban = (klasifikas == 1)


/*----------------------------------------------------*/
                /* Section: Employment */
/*----------------------------------------------------*/

* employed (according to survey definition): any one of
*   b5_r5a1 - worked an uninterrupted hour last week
*   b5_r6   - usually works an uninterrupted hour, but temporarily not at work
*   b5_r7a  - worked a cumulative (but not uninterrupted) hour
*   b5_r7b  - usually works a cumulative hour, but temporarily not at work
  tabulate b5_r5a1, missing
  tabulate b5_r6, missing
  tabulate b5_r7a, missing
  tabulate b5_r7b, missing
  generate employed = (b5_r5a1 == 1 | b5_r6 == 1 | b5_r7a == 1 | b5_r7b == 1)
  tabulate employed, missing

* employment status (note: includes those who answered no to work for income, but answered that they did other work)
  // the employment block is asked of everyone the enumerator deems to have worked,
  // so it must be empty for anyone not flagged as employed above
  tabulate b5_r24a, missing
  assert b5_r24a == . if employed == 0
  // a system-missing status becomes category 0 ("Not Working")
  generate employment_status = cond(b5_r24a == ., 0, b5_r24a)
  label define status ///
    0 "Not Working" 1 "Self-employed" ///
    2 "Business owner with temporary/unpaid workers" ///
    3 "Business owner with paid workers" 4 "Employee" ///
    5 "Temporary worker in agriculture" ///
    6 "Temporary worker (non-agriculture)" 7 "Family/unpaid worker", replace
  label values employment_status status
  tabulate employment_status

  * Type of work: the categories including permanent job, self employed, business owner, etc (12A)
  generate self_employed      = (employment_status == 1)
  generate business_owner     = inlist(employment_status, 2, 3)
  generate self_emp_bus_owner = inlist(employment_status, 1, 2, 3)
  generate perm_worker        = (employment_status == 4)
  generate temp_worker        = inlist(employment_status, 5, 6)
  generate family_unpaid      = (employment_status == 7)

  * From here on "employed" means status 1-6 only: family/unpaid workers (7)
  * and those not working (0) are coded 0, overriding the definition above
  replace employed = inlist(employment_status, 1, 2, 3, 4, 5, 6)

  * Job-search activities. Note: 1 or 3 = yes, 2 or 4 = no, 0 = not asked
  generate reg_job_mkt     = (b5_r16a == 1)
  generate contact_company = (b5_r16b == 3)
  generate advert_online   = (b5_r16d == 3)
  generate search_network  = (b5_r16e == 1)
  generate cap_loc_lic     = (b5_r16f == 3 | b5_r16g == 1 | b5_r16h == 3)
  generate any_attempt     = (b5_r16i == 1)

* Business field
  // 17-sector classification: (https://www.bps.go.id/statictable/2016/01/06/1898/klasifikasi-17-sektor-tabel-input-output-indonesia-2010.html)
  // business_field is a labelled copy of b5_r20_kat
  tab b5_r20_kat, m
  la def business_fields 1 "Agriculture, Forestry and Fisheries" 2 "Mining and excavation" 3 "Industrial Production" 4 "Electricity and Gas" ///
    5 "Water Supply, Waste Management, and Recycling" 6 "Construction" 7 "Wholesale and Retail Trade, Car and Motorcycle Repair" ///
    8 "Transportation and Warehousing" 9 "Hospitality and Restaurants" 10 "Information and Communication" 11 "Financial Services and Insurance" ///
    12 "Real Estate" 13 "Business Services" 14 "Government Administration, Defense and Social Security" 15 "Education" 16 "Health and Social Services" 17 "Other", replace
  gen business_field = b5_r20_kat
  la val business_field business_fields
  tab business_field, m

* KBJI code (https://www.bps.go.id/website/fileMenu/KBJI-2014.pdf)
  // 1982 KJI single digit (tabulated only, not carried into a new variable)
  tab b5_r21_kji, m

  // KBJI 2014 single digit; occupation is a labelled copy of b5_r21_kbj
  tab b5_r21_kbj, m
  la def occupations 0 "Army and Police" 1 "Manager" 2 "Professional" 3 "Technicians and Assistant Professionals" 4 "Administrative Personnel" 5 "Business Services and Sales Personnel" 6 "Skilled Workers in Agriculture, Forestry and Fisheries" 7 "Production, Craft, and Related Workers" 8 "Machine Operators and Assembly Workers" 9 "Blue-collar workers", replace
  gen occupation = b5_r21_kbj
  la val occupation occupations
  tab occupation, m

* wage last month (assuming this is January)
* wage_month is the sum of the four income items and is defined only for
* employment statuses 1, 4, 5 and 6.
* NOTE(review): the sum is missing whenever any of the four items is missing,
* so this presumably relies on non-applicable items being stored as 0 - confirm
  summarize b5_r28b1 b5_r28b2 if inlist(employment_status, 1, 5, 6)
  summarize b5_r28c1 b5_r28c2 if inlist(employment_status, 4)
  generate wage_month = b5_r28b1 + b5_r28b2 + b5_r28c1 + b5_r28c2 if inlist(employment_status, 1, 4, 5, 6)
  summarize wage_month if inlist(employment_status, 1, 4, 5, 6), detail

* Individual earnings, including zero if not employed
  generate ind_earnings = wage_month
  replace ind_earnings = 0 if employed == 0
  summarize ind_earnings, detail

* Adjusted earnings: cap at the 1st and 99th percentiles of the positive
* values; zeros are left at zero
  generate ind_adj_earnings = ind_earnings
  summarize ind_adj_earnings if ind_adj_earnings > 0, detail
  tempname earn_lo earn_hi
  scalar `earn_lo' = r(p1)
  scalar `earn_hi' = r(p99)
  replace ind_adj_earnings = `earn_hi' if ind_adj_earnings > `earn_hi' & !missing(ind_adj_earnings)
  replace ind_adj_earnings = `earn_lo' if ind_adj_earnings < `earn_lo' & !missing(ind_adj_earnings) & ind_adj_earnings != 0
  summarize ind_adj_earnings, detail

* Adjusted earnings restricted to positive values (missing otherwise)
  generate ind_pos_adj_earnings = ind_adj_earnings if ind_adj_earnings > 0

* Indicator for strictly positive earnings (missing when earnings are missing)
  generate ind_earnings_pos = ind_earnings > 0 if !missing(ind_earnings)

* hours worked last week
  summarize b5_r23a? if inlist(employment_status, 1, 4, 5, 6)
  summarize b5_r23a if inlist(employment_status, 1, 4, 5, 6)
  // weekly maximum expressed per day (authors' note: max is 14 hours per day)
  display `r(max)'/7
  histogram b5_r23a
  generate hours_worked = b5_r23a
  // hours may only be missing for people who are not employed; those become 0
  assert employed == 0 if missing(hours_worked)
  replace hours_worked = 0 if missing(hours_worked)
  summarize hours_worked if hours_worked != 0

* hourly wage (income per day / hours per day)
  * raw wages and hours: monthly wage over 31 days, weekly hours over 7 days;
  * zero hours would divide by zero, so those cases are set to 0
  generate hourly_wage = (wage_month / 31) / (hours_worked / 7)
  replace hourly_wage = 0 if hours_worked == 0
  summarize hourly_wage, detail

  * Cap at the 1st and 99th percentiles of the positive values (zeros stay zero)
  generate hourly_adj_wage = hourly_wage
  summarize hourly_adj_wage if hourly_adj_wage > 0, detail
  tempname wage_lo wage_hi
  scalar `wage_lo' = r(p1)
  scalar `wage_hi' = r(p99)
  replace hourly_adj_wage = `wage_hi' if hourly_adj_wage > `wage_hi' & !missing(hourly_adj_wage)
  replace hourly_adj_wage = `wage_lo' if hourly_adj_wage < `wage_lo' & !missing(hourly_adj_wage) & hourly_adj_wage != 0
  summarize hourly_adj_wage, detail

  // Exchange rate: 14,403.60 rp/$
  // implied median monthly salary (assuming 8 hours/day, 5 days/week, 4 weeks/month),
  // first in rupiah and then in dollars at roughly 14,400 rp/$
  display `r(p50)' * 4 * 5 * 8
  display `r(p50)' * 4 * 5 * 8 / 14400

* work tenure
  tabulate b5_r22a1, missing
  tabulate b5_r22a2i, missing
  tabulate b5_r22a2ii, missing

  * which of the two tenure questions was answered:
  * b5_r22a1 (months, tenure under a year) or b5_r22a2i (years, tenure over a year)
  generate tenure_year_or_less = (b5_r22a1 != .)
  generate tenure_year_more = (b5_r22a2i != .)
  tabulate tenure_year_or_less tenure_year_more

  * tenure in months
  // tenure < 1 year: months as reported
  generate tenure_months = b5_r22a1 if tenure_year_or_less == 1
  summarize tenure_months

  // tenure > 1 year: 12 * years + months; a month value of 99 (assumed to be
  // "don't know") is replaced by 6, the middle of the year
  replace tenure_months = (12 * b5_r22a2i) + cond(b5_r22a2ii == 99, 6, b5_r22a2ii) if tenure_year_more == 1
  summarize tenure_months

  // anyone still without a tenure gets 0 months
  count if tenure_months == .
  replace tenure_months = 0 if tenure_months == .
  drop tenure_year_or_less tenure_year_more
  summarize tenure_months, detail

  * sanity check: nobody reports holding the same job since before they were
  * (approximately) 4 years old. age_months = months lived since age 4.
  generate age_months = 12 * age - 48
  summarize age_months
  assert tenure_months < age_months if tenure_months != .

* use internet at work
  tabulate b5_r25b, missing
  // coded 0 (not missing) for anyone who is not employed
  generate use_internet_work = (b5_r25b == 1 & employed != 0)
  tabulate use_internet_work

* had more than one job
  tabulate b5_r37a, missing
  tabulate b5_r37b, missing
  generate multiple_jobs = (b5_r37a == 1 | b5_r37b == 1)
  tabulate multiple_jobs, missing

* left a job in the past year
  tabulate b5_r48, missing
  generate leave_job = (b5_r48 == 1)

* job search / business creation
  tabulate b5_r12a, missing
  generate looking_job = (b5_r12a == 1)

  tabulate b5_r12b, missing
  generate prep_new_bus = (b5_r12b == 1)
  * (disabled) household-level version of prep_new_bus:
  *bys hh_id_s: egen prep_bus_hh = max(prep_new_bus)

* labor force: employed, looking for a job, or preparing a new business
  tabulate employed, missing
  generate labor_force = (employed == 1 | looking_job == 1 | prep_new_bus == 1)
  tabulate labor_force, missing

******************************************************
* Tag observations by how many other observations share the same anon_id4
  duplicates tag anon_id4, generate(anon_id4_dupe)
  tabulate anon_id4_dupe

* Drop every copy of a repeated anon_id4 (none of the copies is kept);
* observations with a missing anon_id4 are retained even though they tag
* each other as duplicates
  drop if !missing(anon_id4) & anon_id4_dupe != 0
  drop anon_id4_dupe
  
/*----------------------------------------------------*/
        /* Section: Export cleaned data */
/*----------------------------------------------------*/
  
  * Save the cleaned data only when it reproduces the expected data signature.
  * On a mismatch, print the signature that was found and end the do-file with
  * a non-zero return code (459), so that a calling do-file stops as well and
  * no dataset is written.
  datasignature
  if "`r(datasignature)'" == "196383:262(24500):3707710237:2410186800" {
    save "$KP_deid_sakernas/Clean/sak_feb19_deid_clean.dta", replace
  }
  else {
    di as err "Careful, your machine produces a different dataset"
    di as err "  signature found: `r(datasignature)'"
    exit 459
  }
  


/*----------------------------------------------------*/
        * Merge with admin data and export
/*----------------------------------------------------*/

  * Keep respondents that carry at least one of the two identifiers
  keep if !mi(userid_bps) | !mi(anon_id4)

  * Drop the raw questionnaire items; only the constructed variables are kept
  drop b1_* b2_* b4_* b5_*

  gen sak_round = 2

  drop date_incentive

* merge
  * Stash the respondents that have an anon_id4 in a temporary file. No
  * preserve/restore is needed: the admin data replaces the data in memory
  * immediately afterwards.
  keep if !missing(anon_id4)
  tempfile sak_anon_id
  save `sak_anon_id'

  * Attach the survey variables to the admin records (many admin rows per
  * anon_id4) and keep only the rows that match
  use "$KP_deid_admin/Clean/pmo_b1-22_clean_long_deid.dta", clear
  merge m:1 anon_id4 using `sak_anon_id', nogen keep(3)

  * Save the merged data only when it reproduces the expected data signature.
  * On a mismatch, print the signature that was found and end the do-file with
  * a non-zero return code (459), so that a calling do-file stops as well.
  compress
  datasignature
  if "`r(datasignature)'" == "44035:155(94397):2093090154:4051163820" {
    save "$KP_deid_sakernas/Clean/sak_feb19_deid_clean_merged.dta", replace
  }
  else {
    di as err "Careful, your machine produces a different dataset"
    di as err "  signature found: `r(datasignature)'"
    exit 459
  }

  * Close the log that was opened at the top of this file
  cap log close

  // done
